# Title: Measuring Interethnic Marriage in Africa
# Author: Branden Bohrnsen, bohrnsen@umich.edu
# Date: 1/30/2026
# Description: Calculate diversity metrics including HHI at all levels of analysis.

library(tidyverse)
library(here)
census <- read_csv(here("data", "clean", "census.csv")) %>% filter(Sex %in% c("F", "Female"))
census <- census %>% mutate(across(where(is.factor), as.character))

calculate_hhi <- function(data, group_var) {
  data %>%
    filter(!is.na(.data[[group_var]])) %>%
    group_by(.data[[group_var]]) %>%
    summarise(n = n(), .groups = 'drop') %>%
    mutate(
      p = n / sum(n),
      hhi = sum(p^2)
    ) %>%
    slice(1) %>%
    pull(hhi)
}

n_group_national <- census %>% filter(!is.na(Ethnicity)) %>% nrow()
hhi_group_national <- calculate_hhi(census, "Ethnicity")
n_block_national <- census %>% filter(!is.na(EthnicBlock)) %>% nrow()
hhi_block_national <- calculate_hhi(census, "EthnicBlock")

census_urban_gte <- census %>% filter(Urbanization >= 0.5)
n_group_urban_gte <- census_urban_gte %>% filter(!is.na(Ethnicity)) %>% nrow()
hhi_group_urban_gte <- calculate_hhi(census_urban_gte, "Ethnicity")
n_block_urban_gte <- census_urban_gte %>% filter(!is.na(EthnicBlock)) %>% nrow()
hhi_block_urban_gte <- calculate_hhi(census_urban_gte, "EthnicBlock")

census_urban_lte <- census %>% filter(Urbanization <= 0.5)
n_group_urban_lte <- census_urban_lte %>% filter(!is.na(Ethnicity)) %>% nrow()
hhi_group_urban_lte <- calculate_hhi(census_urban_lte, "Ethnicity")
n_block_urban_lte <- census_urban_lte %>% filter(!is.na(EthnicBlock)) %>% nrow()
hhi_block_urban_lte <- calculate_hhi(census_urban_lte, "EthnicBlock")

hhi_summary_table <- tibble::tibble(
  Level = c("National", "Urban", "Rural"),
  N_Group = c(n_group_national, n_group_urban_gte, n_group_urban_lte),
  Ethnic_Group_HHI = c(hhi_group_national, hhi_group_urban_gte, hhi_group_urban_lte),
  N_Block = c(n_block_national, n_block_urban_gte, n_block_urban_lte),
  Ethnic_Block_HHI = c(hhi_block_national, hhi_block_urban_gte, hhi_block_urban_lte)
)

decades <- c("50s", "60s", "70s", "80s", "90s", "00s")
decade_labels <- c("1951-1960", "1961-1970", "1971-1980", "1981-1990", "1991-2000", "2001-2010")

hhi_by_decade_list <- list()

for (i in seq_along(decades)) {
  decade <- decades[i]
  decade_label <- decade_labels[i]
  in_market_col <- paste0("InMarket_", decade)

  census_decade <- census %>% filter(.data[[in_market_col]] == 1)

  n_group <- census_decade %>% filter(!is.na(Ethnicity)) %>% nrow()
  hhi_group <- calculate_hhi(census_decade, "Ethnicity")
  n_block <- census_decade %>% filter(!is.na(EthnicBlock)) %>% nrow()
  hhi_block <- calculate_hhi(census_decade, "EthnicBlock")

  hhi_by_decade_list[[i]] <- tibble::tibble(
    Decade = decade_label,
    N_Group = n_group,
    Ethnic_Group_HHI = hhi_group,
    N_Block = n_block,
    Ethnic_Block_HHI = hhi_block
  )
}

hhi_by_decade_table <- do.call(rbind, hhi_by_decade_list)

census_long_decade <- census %>%
  pivot_longer(
    cols = c(
      "InMarket_50s", "InMarket_60s", "InMarket_70s",
      "InMarket_80s", "InMarket_90s", "InMarket_00s"
    ),
    names_to = "Decade_Raw",
    values_to = "InMarket"
  ) %>% 
  filter(InMarket == 1) %>% 
  mutate(
      Decade = case_when(
          Decade_Raw == "InMarket_50s" ~ "1951-1960",
          Decade_Raw == "InMarket_60s" ~ "1961-1970",
          Decade_Raw == "InMarket_70s" ~ "1971-1980",
          Decade_Raw == "InMarket_80s" ~ "1981-1990",
          Decade_Raw == "InMarket_90s" ~ "1991-2000",
          Decade_Raw == "InMarket_00s" ~ "2001-2010"
      )
  ) %>% 
  select(-InMarket)


hhi_by_const_decade <- census_long_decade %>%
  group_by(Const, District, Decade) %>%
  nest() %>%
  mutate(
    N_Ethnicity = map_int(data, ~sum(!is.na(.$Ethnicity))),
    HHI_Ethnicity = map_dbl(data, ~calculate_hhi(., "Ethnicity")),
    N_EthnicBlock = map_int(data, ~sum(!is.na(.$EthnicBlock))),
    HHI_EthnicBlock = map_dbl(data, ~calculate_hhi(., "EthnicBlock"))
  ) %>%
  select(-data)

n_gc_ethnicity <- census_long_decade %>%
  filter(!is.na(Ethnicity)) %>%
  group_by(Ethnicity, Const, District, Decade) %>%
  summarise(n_gc = n(), .groups = "drop")

n_gc_ethnic_block <- census_long_decade %>%
  filter(!is.na(EthnicBlock)) %>%
  group_by(EthnicBlock, Const, District, Decade) %>%
  summarise(n_gc = n(), .groups = "drop")

total_n_ethnicity <- nrow(census_long_decade %>% filter(!is.na(Ethnicity)))
total_n_ethnic_block <- nrow(census_long_decade %>% filter(!is.na(EthnicBlock)))

sum_n_const_decade_ethnicity <- sum(hhi_by_const_decade$N_Ethnicity)
sum_n_const_decade_ethnic_block <- sum(hhi_by_const_decade$N_EthnicBlock)

sum_n_gc_ethnicity <- sum(n_gc_ethnicity$n_gc)
sum_n_gc_ethnic_block <- sum(n_gc_ethnic_block$n_gc)

verification_summary <- tibble::tibble(
  Description = c(
    "Total Person-Decades (Ethnicity)",
    "Sum from Constituency-Decade Table (Ethnicity)",
    "Sum from Group-Constituency-Decade Table (Ethnicity)",
    "---",
    "Total Person-Decades (Ethnic Block)",
    "Sum from Constituency-Decade Table (Ethnic Block)",
    "Sum from Group-Constituency-Decade Table (Ethnic Block)"
  ),
  N = c(
    total_n_ethnicity,
    sum_n_const_decade_ethnicity,
    sum_n_gc_ethnicity,
    NA,
    total_n_ethnic_block,
    sum_n_const_decade_ethnic_block,
    sum_n_gc_ethnic_block
  )
)

hhi_ethnicity_quintiles <- quantile(
  hhi_by_const_decade$HHI_Ethnicity,
  probs = seq(0, 1, 0.2),
  na.rm = TRUE
)

hhi_ethnic_block_quintiles <- quantile(
  hhi_by_const_decade$HHI_EthnicBlock,
  probs = seq(0, 1, 0.2),
  na.rm = TRUE
)

hhi_quintiles_table <- tibble::tibble(
  Quintile = names(hhi_ethnicity_quintiles),
  HHI_Ethnicity = hhi_ethnicity_quintiles,
  HHI_EthnicBlock = hhi_ethnic_block_quintiles
)

elf_by_const_time_invariant <- census %>%
  group_by(Const, District) %>%
  nest() %>%
  mutate(
    ELF_Ethnicity = 1 - map_dbl(data, ~calculate_hhi(., "Ethnicity")),
    ELF_EthnicBlock = 1 - map_dbl(data, ~calculate_hhi(., "EthnicBlock")),
    HHI_Ethnicity = map_dbl(data, ~calculate_hhi(., "Ethnicity")),
    HHI_EthnicBlock = map_dbl(data, ~calculate_hhi(., "EthnicBlock")),
    n()
  ) %>%
  select(-data)

elf_ethnicity_quintiles_ti <- quantile(
  elf_by_const_time_invariant$ELF_Ethnicity,
  probs = seq(0, 1, 0.2),
  na.rm = TRUE
)

hhi_ethnicity_quintiles_ti <- quantile(
  elf_by_const_time_invariant$HHI_Ethnicity,
  probs = seq(0, 1, 0.2),
  na.rm = TRUE
)

elf_ethnic_block_quintiles_ti <- quantile(
  elf_by_const_time_invariant$ELF_EthnicBlock,
  probs = seq(0, 1, 0.2),
  na.rm = TRUE
)

hhi_ethnic_block_quintiles_ti <- quantile(
  elf_by_const_time_invariant$HHI_EthnicBlock,
  probs = seq(0, 1, 0.2),
  na.rm = TRUE
)


elf_quintiles_table_ti <- tibble::tibble(
  Quintile = names(elf_ethnicity_quintiles_ti),
  ELF_Ethnicity_TimeInvariant = elf_ethnicity_quintiles_ti,
  ELF_EthnicBlock_TimeInvariant = elf_ethnic_block_quintiles_ti,
  HHI_Ethnicity_TimeInvariant = hhi_ethnicity_quintiles_ti,
  HHI_EthnicBlock_TimeInvariant = hhi_ethnic_block_quintiles_ti
)

weighted.mean(elf_by_const_time_invariant$ELF_EthnicBlock, elf_by_const_time_invariant$`n()`)
weighted.mean(elf_by_const_time_invariant$ELF_Ethnicity, elf_by_const_time_invariant$`n()`)

if (!dir.exists(here("out", "inter"))) {
  dir.create(here("out", "inter"), recursive = TRUE)
}

write_csv(hhi_summary_table, here("out", "inter", "hhi_by_level.csv"))
write_csv(hhi_by_decade_table, here("out", "inter", "hhi_by_decade.csv"))
write_csv(hhi_by_const_decade, here("out", "inter", "hhi_by_const_decade.csv"))
write_csv(n_gc_ethnicity, here("out", "inter", "n_gc_ethnicity.csv"))
write_csv(n_gc_ethnic_block, here("out", "inter", "n_gc_ethnic_block.csv"))
write_csv(hhi_quintiles_table, here("out", "inter", "hhi_quintiles_by_decade.csv"))
write_csv(elf_by_const_time_invariant, here("out", "inter", "hhi_by_const_time_invariant.csv"))
write_csv(elf_quintiles_table_ti, here("out", "inter", "hhi_quintiles_time_invariant.csv"))